#packages

library(tidyverse)
## Warning: a(z) 'ggplot2' csomag az R 4.4.3 verziójával lett fordítva
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## Warning: a(z) 'janitor' csomag az R 4.4.3 verziójával lett fordítva
## 
## Kapcsolódás csomaghoz: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(lubridate)
library(readr)
library(psych)
## 
## Kapcsolódás csomaghoz: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(dplyr)
library(cowplot)
## 
## Kapcsolódás csomaghoz: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(lme4)
## A szükséges csomag betöltődik: Matrix
## 
## Kapcsolódás csomaghoz: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
## 
## Kapcsolódás csomaghoz: 'sjPlot'
## 
## The following objects are masked from 'package:cowplot':
## 
##     plot_grid, save_plot
library(performance)

#loading data

spotify_songs <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-01-21/spotify_songs.csv')
## Rows: 32833 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): track_id, track_name, track_artist, track_album_id, track_album_na...
## dbl (13): track_popularity, danceability, energy, key, loudness, mode, speec...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#exploring data, checking for mistakes

glimpse(spotify_songs)
## Rows: 32,833
## Columns: 23
## $ track_id                 <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name               <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist             <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity         <dbl> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id           <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name         <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "20…
## $ playlist_name            <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id              <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre           <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre        <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability             <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy                   <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key                      <dbl> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness                 <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode                     <dbl> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness              <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness             <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness         <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness                 <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence                  <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo                    <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms              <dbl> 194754, 162600, 176616, 169093, 189052, 16304…
colSums(is.na(spotify_songs))
##                 track_id               track_name             track_artist 
##                        0                        5                        5 
##         track_popularity           track_album_id         track_album_name 
##                        0                        0                        5 
## track_album_release_date            playlist_name              playlist_id 
##                        0                        0                        0 
##           playlist_genre        playlist_subgenre             danceability 
##                        0                        0                        0 
##                   energy                      key                 loudness 
##                        0                        0                        0 
##                     mode              speechiness             acousticness 
##                        0                        0                        0 
##         instrumentalness                 liveness                  valence 
##                        0                        0                        0 
##                    tempo              duration_ms 
##                        0                        0
#There are five tracks where the song title and artist are missing, and for these the popularity – our main variable – is 0, which is suspicious. I will remove these from the dataset.

data <- spotify_songs %>% 
  drop_na()

#distribution checks for numeric variables
hist(data$track_popularity)

hist(data$danceability)

hist(data$energy)

hist(data$loudness)

hist(data$speechiness)

hist(data$acousticness)

hist(data$instrumentalness)

hist(data$liveness)

hist(data$valence)

hist(data$tempo)

hist(data$duration_ms)

# distribution of categorical variables
pie(table(data$playlist_genre))

pie(table(data$playlist_subgenre))

pie(table(data$mode))

pie(table(data$key))

#descriptive statistics
data %>%
  select(track_popularity, danceability, energy, key, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms, playlist_genre, playlist_subgenre, mode) %>%
  describe()
##                    vars     n      mean       sd    median   trimmed      mad
## track_popularity      1 32828     42.48    24.98     45.00     43.01    26.69
## danceability          2 32828      0.65     0.15      0.67      0.66     0.15
## energy                3 32828      0.70     0.18      0.72      0.71     0.19
## key                   4 32828      5.37     3.61      6.00      5.35     4.45
## loudness              5 32828     -6.72     2.99     -6.17     -6.41     2.52
## speechiness           6 32828      0.11     0.10      0.06      0.09     0.04
## acousticness          7 32828      0.18     0.22      0.08      0.13     0.11
## instrumentalness      8 32828      0.08     0.22      0.00      0.02     0.00
## liveness              9 32828      0.19     0.15      0.13      0.16     0.07
## valence              10 32828      0.51     0.23      0.51      0.51     0.27
## tempo                11 32828    120.88    26.90    121.98    119.13    26.75
## duration_ms          12 32828 225796.83 59836.49 216000.00 220378.37 47246.01
## playlist_genre*      13 32828      3.44     1.71      3.00      3.43     2.97
## playlist_subgenre*   14 32828     12.60     6.79     12.00     12.60     8.90
## mode                 15 32828      0.57     0.50      1.00      0.58     0.00
##                        min       max     range  skew kurtosis     se
## track_popularity      0.00    100.00    100.00 -0.23    -0.93   0.14
## danceability          0.00      0.98      0.98 -0.50     0.01   0.00
## energy                0.00      1.00      1.00 -0.64     0.00   0.00
## key                   0.00     11.00     11.00 -0.02    -1.31   0.02
## loudness            -46.45      1.27     47.72 -1.36     4.49   0.02
## speechiness           0.00      0.92      0.92  1.97     4.26   0.00
## acousticness          0.00      0.99      0.99  1.59     1.88   0.00
## instrumentalness      0.00      0.99      0.99  2.76     6.27   0.00
## liveness              0.00      1.00      1.00  2.08     5.07   0.00
## valence               0.00      0.99      0.99 -0.01    -0.90   0.00
## tempo                 0.00    239.44    239.44  0.53     0.08   0.15
## duration_ms        4000.00 517810.00 513810.00  1.15     2.70 330.25
## playlist_genre*       1.00      6.00      5.00  0.01    -1.27   0.01
## playlist_subgenre*    1.00     24.00     23.00  0.02    -1.18   0.04
## mode                  0.00      1.00      1.00 -0.27    -1.93   0.00
#correlation matrix for numeric variables 

num_data <- data[sapply(data, is.numeric)]
cor(num_data, use = "pairwise.complete.obs")
##                  track_popularity danceability       energy           key
## track_popularity     1.0000000000  0.064753904 -0.108983765 -0.0004048699
## danceability         0.0647539039  1.000000000 -0.086073841  0.0117706083
## energy              -0.1089837648 -0.086073841  1.000000000  0.0099724054
## key                 -0.0004048699  0.011770608  0.009972405  1.0000000000
## loudness             0.0577172461  0.025351421  0.676661769  0.0009196045
## mode                 0.0105531999 -0.058711007 -0.004778384 -0.1739811674
## speechiness          0.0070670368  0.181808452 -0.032184394  0.0224621227
## acousticness         0.0850421483 -0.024514628 -0.539732195  0.0043780155
## instrumentalness    -0.1500033849 -0.008657809  0.033282124  0.0060217005
## liveness            -0.0545933642 -0.123899361  0.161316603  0.0028337090
## valence              0.0332775379  0.330538138  0.151050467  0.0199327942
## tempo               -0.0055383266 -0.184132454  0.150072270 -0.0133158906
## duration_ms         -0.1436342100 -0.096921719  0.012560486  0.0151412053
##                       loudness         mode  speechiness acousticness
## track_popularity  0.0577172461  0.010553200  0.007067037  0.085042148
## danceability      0.0253514208 -0.058711007  0.181808452 -0.024514628
## energy            0.6766617694 -0.004778384 -0.032184394 -0.539732195
## key               0.0009196045 -0.173981167  0.022462123  0.004378015
## loudness          1.0000000000 -0.019242161  0.010312605 -0.361646363
## mode             -0.0192421607  1.000000000 -0.063446138  0.009399022
## speechiness       0.0103126054 -0.063446138  1.000000000  0.026167720
## acousticness     -0.3616463627  0.009399022  0.026167720  1.000000000
## instrumentalness -0.1478231457 -0.006759530 -0.103385447 -0.006880668
## liveness          0.0775888723 -0.005485080  0.055337384 -0.077247436
## valence           0.0534109143  0.002566773  0.064755765 -0.016832604
## tempo             0.0937612756  0.014338514  0.044649018 -0.112781539
## duration_ms      -0.1150394799  0.015575548 -0.089432207 -0.081552898
##                  instrumentalness     liveness      valence        tempo
## track_popularity     -0.150003385 -0.054593364  0.033277538 -0.005538327
## danceability         -0.008657809 -0.123899361  0.330538138 -0.184132454
## energy                0.033282124  0.161316603  0.151050467  0.150072270
## key                   0.006021700  0.002833709  0.019932794 -0.013315891
## loudness             -0.147823146  0.077588872  0.053410914  0.093761276
## mode                 -0.006759530 -0.005485080  0.002566773  0.014338514
## speechiness          -0.103385447  0.055337384  0.064755765  0.044649018
## acousticness         -0.006880668 -0.077247436 -0.016832604 -0.112781539
## instrumentalness      1.000000000 -0.005505231 -0.175405779  0.023302804
## liveness             -0.005505231  1.000000000 -0.020431976  0.020886604
## valence              -0.175405779 -0.020431976  1.000000000 -0.025638719
## tempo                 0.023302804  0.020886604 -0.025638719  1.000000000
## duration_ms           0.063256032  0.006196648 -0.032291758 -0.001347396
##                   duration_ms
## track_popularity -0.143634210
## danceability     -0.096921719
## energy            0.012560486
## key               0.015141205
## loudness         -0.115039480
## mode              0.015575548
## speechiness      -0.089432207
## acousticness     -0.081552898
## instrumentalness  0.063256032
## liveness          0.006196648
## valence          -0.032291758
## tempo            -0.001347396
## duration_ms       1.000000000
# Large overview figure: relationship between track popularity and numeric variables

base_theme <- theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 11),
    axis.title = element_text(face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# Energy × track popularity
p_energy <- ggplot(data, aes(x = energy, y = track_popularity)) +
  geom_point(color = "#4E79A7", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Energy", y = "Track popularity", title = "Energy") +
  base_theme

# Danceability × track popularity
p_dance <- ggplot(data, aes(x = danceability, y = track_popularity)) +
  geom_point(color = "#59A14F", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Danceability", y = "Track popularity", title = "Danceability") +
  base_theme

# Loudness × track popularity
p_loudness <- ggplot(data, aes(x = loudness, y = track_popularity)) +
  geom_point(color = "#9C755F", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Loudness", y = "Track popularity", title = "Loudness") +
  base_theme

# Speechiness × track popularity
p_speechiness <- ggplot(data, aes(x = speechiness, y = track_popularity)) +
  geom_point(color = "#F28E2B", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Speechiness", y = "Track popularity", title = "Speechiness") +
  base_theme

# Acousticness × track popularity
p_acousticness <- ggplot(data, aes(x = acousticness, y = track_popularity)) +
  geom_point(color = "#76B7B2", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Acousticness", y = "Track popularity", title = "Acousticness") +
  base_theme

# Instrumentalness × track popularity
p_instrumentalness <- ggplot(data, aes(x = instrumentalness, y = track_popularity)) +
  geom_point(color = "#EDC948", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Instrumentalness", y = "Track popularity", title = "Instrumentalness") +
  base_theme

# Liveness × track popularity
p_liveness <- ggplot(data, aes(x = liveness, y = track_popularity)) +
  geom_point(color = "#B07AA1", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Liveness", y = "Track popularity", title = "Liveness") +
  base_theme

# Valence × track popularity
p_valence <- ggplot(data, aes(x = valence, y = track_popularity)) +
  geom_point(color = "#E15759", alpha = 0.3, size = 1.3) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Valence", y = "Track popularity", title = "Valence") +
  base_theme

# Tempo × track popularity
p_tempo <- ggplot(data, aes(x = tempo, y = track_popularity)) +
  geom_point(color = "#FF9DA7", alpha = 0.25, size = 1.2) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Tempo", y = "Track popularity", title = "Tempo") +
  base_theme

# Duration × track popularity
p_duration_ms <- ggplot(data, aes(x = duration_ms, y = track_popularity)) +
  geom_point(color = "#BAB0AC", alpha = 0.25, size = 1.2) +
  geom_smooth(method = "lm", se = TRUE,
              color = "black", fill = "grey80", linewidth = 0.9) +
  labs(x = "Duration (ms)", y = "Track popularity", title = "Duration") +
  base_theme

# Combined overview of the plots
plot_numvar <- cowplot::plot_grid(
  p_energy, p_dance,
  p_loudness, p_speechiness,
  p_acousticness, p_instrumentalness,
  p_liveness, p_valence,
  p_tempo, p_duration_ms,
  labels = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"),
  ncol = 2
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
plot_numvar

# Overview of how categorical variables relate to track popularity

# Playlist subgenre × track popularity (mean ± SD)
p_subgenre_bar <- data %>%
  group_by(playlist_subgenre) %>%
  summarise(
    mean_pop = mean(track_popularity, na.rm = TRUE),
    sd_pop   = sd(track_popularity, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = reorder(playlist_subgenre, mean_pop), y = mean_pop)) +
  geom_col(fill = "grey85", color = "black") +
  geom_errorbar(aes(ymin = mean_pop - sd_pop, ymax = mean_pop + sd_pop), width = 0.2) +
  coord_flip() +
  labs(
    x = "Playlist subgenre",
    y = "Mean track popularity (± SD)",
    title = "Playlist subgenre"
  ) +
  base_theme

p_subgenre_bar

# Playlist genre × track popularity
p_genre <- ggplot(data, aes(x = playlist_genre, y = track_popularity)) +
  geom_boxplot(
    aes(color = playlist_genre),
    outlier.shape = NA
  ) +
  geom_jitter(
    aes(color = playlist_genre),
    width = 0.2,
    alpha = 0.05,
    size = 0.9
  ) +
  labs(
    x = "Playlist genre",
    y = "Track popularity",
    title = "Playlist genre"
  ) +
  base_theme +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

p_genre

# Mode (major vs. minor) × track popularity
p_mode <- ggplot(data, aes(x = factor(mode), y = track_popularity)) +
  geom_boxplot(
    aes(color = factor(mode)),
    outlier.shape = NA
  ) +
  geom_jitter(
    aes(color = factor(mode)),
    width = 0.15,
    alpha = 0.05,
    size = 0.9
  ) +
  scale_x_discrete(labels = c("Minor", "Major")) +
  scale_color_viridis_d(guide = "none") +
  labs(
    x = "Mode",
    y = "Track popularity",
    title = "Mode"
  ) +
  base_theme

p_mode

#Analysis plan

#Imagine that we are aspiring musicians who also know a bit of statistics, but despite all our efforts, we just cannot manage to produce a true banger hit. Instead of guessing what might work, we decide to turn to data and let statistics tell us what a song should be like in order to become as popular as possible.

#Following this idea, in this project I build and compare linear mixed-effects models to identify which musical features are the best predictors of track popularity. Mixed-effects models are particularly suitable for this task, as multiple tracks in the dataset are produced by the same artist, violating the assumption of independent observations required by standard linear regression. By including artist as a random effect, the models account for this dependency while allowing us to focus on the general relationships between song characteristics and popularity.

# Predictors were selected to reflect musical features that can be actively adjusted by musicians  (assuming that I cannot/do not want to change myself, so genre and instruments are taken as given). As energy, danceability, valence, tempo, speechiness and mode was used as predictors.

# In addition, I am curious whether it is necessary to consider multiple musical features, or whether it is sufficient that a track is easy to dance to. Therefore, I also build a simpler model to compare it with the more complex one.

#Assmuption checks for LMM ##building the model

#Mean-centering continuous predictors
data <- data %>%
  mutate(
    energy_c        = scale(energy, scale = FALSE),
    danceability_c  = scale(danceability, scale = FALSE),
    valence_c       = scale(valence, scale = FALSE),
    tempo_c         = scale(tempo, scale = FALSE),
    speechiness_c   = scale(speechiness, scale = FALSE)
  )

#Mode as categorical predictor
data$mode_f <- factor(data$mode, labels = c("Minor", "Major"))

#LMM with artist as random intercept
spotify_model <- lmer(
  track_popularity ~
    energy_c +
    danceability_c +
    valence_c +
    tempo_c +
    speechiness_c +
    mode_f +
    (1 | track_artist),
  data = data
)

##checking linearity and homoskedaticity on a residuals vs. fitted plot

#linearity check with scatterplots
plot_numvar

#the variables seems to be ok in terms of liearity

#residuals vs. fitted plot
diag_df <- data.frame(
  fitted = fitted(spotify_model),
  resid  = resid(spotify_model)
)

ggplot(diag_df, aes(x = fitted, y = resid)) +
  geom_point(color = "grey40", alpha = 0.4, size = 1.3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Linearity & homoskedasticity check: residuals vs fitted values"
  ) +
  base_theme

#Homoscedasticity seems to be violated, with a slight funnel-shaped pattern in the residuals. Track popularity is skewed, with many values close to zero. To handle this, I apply a log transformation to the outcome variable.

# Log-transform track popularity to reduce skewness and heteroscedasticity
data <- data %>%
  mutate(
    track_popularity_log = log(track_popularity + 1)
  )

# LMM with log-transformed outcome
spotify_modell_log <- lmer(
  track_popularity_log ~
    energy_c +
    danceability_c +
    valence_c +
    tempo_c +
    speechiness_c +
    mode_f +
    (1 | track_artist),
  data = data
)


#checkyng residuals vs. fitted plot again
#residuals vs. fitted plot
diag_df <- data.frame(
  fitted = fitted(spotify_modell_log),
  resid  = resid(spotify_modell_log)
)

ggplot(diag_df, aes(x = fitted, y = resid)) +
  geom_point(color = "grey40", alpha = 0.4, size = 1.3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Linearity & homoskedasticity check: residuals vs fitted values"
  ) +
  base_theme

# The log transformation did not improve the residual pattern. I therefore return to the original model with untransformed track popularity. LMMs are fairly robust, so this violation is handled as a limitation of the analysis.

##checking the normality of the residuals

#Q–Q plot of residuals
diag_df <- data.frame(resid = resid(spotify_model))

ggplot(diag_df, aes(sample = resid)) +
  stat_qq(color = "grey40", alpha = 0.6) +
  stat_qq_line(color = "black") +
  labs(
    x = "Theoretical quantiles",
    y = "Sample quantiles",
    title = "Residual Q–Q plot"
  ) +
  base_theme

#histogram of the residuals
ggplot(diag_df, aes(x = resid)) +
  geom_histogram(bins = 30, fill = "grey85", color = "black") +
  labs(
    x = "Residuals",
    y = "Count",
    title = "Residual distribution"
  ) +
  base_theme

#based on the plots, normality of the resiusals seems to be ok

##Model convergence and random-effect variance

#Model convergence and random-effect variance
summary(spotify_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c +  
##     speechiness_c + mode_f + (1 | track_artist)
##    Data: data
## 
## REML criterion at convergence: 296314
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0723 -0.5193  0.1095  0.6230  3.2771 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  track_artist (Intercept) 182.6    13.51   
##  Residual                 389.9    19.75   
## Number of obs: 32828, groups:  track_artist, 10692
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     38.278893   0.241162 158.727
## energy_c       -13.868005   0.803975 -17.249
## danceability_c   3.480819   1.065912   3.266
## valence_c        6.058650   0.636827   9.514
## tempo_c          0.006744   0.004769   1.414
## speechiness_c   -0.667654   1.405035  -0.475
## mode_fMajor      0.813036   0.250380   3.247
## 
## Correlation of Fixed Effects:
##             (Intr) enrgy_ dncbl_ vlnc_c temp_c spchn_
## energy_c     0.027                                   
## danceblty_c -0.020  0.095                            
## valence_c    0.001 -0.220 -0.337                     
## tempo_c      0.001 -0.105  0.182 -0.030              
## speechnss_c -0.015  0.010 -0.058 -0.043 -0.100       
## mode_fMajor -0.585  0.009  0.017  0.000 -0.012  0.035
isSingular(spotify_model, tol = 1e-4)
## [1] FALSE
# The extended model converged properly, is not singular, and shows meaningful random-effect variance, indicating that the model assumptions are sufficiently met.

#interpreation of the complex model

spotify_model
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c +  
##     speechiness_c + mode_f + (1 | track_artist)
##    Data: data
## REML criterion at convergence: 296314
## Random effects:
##  Groups       Name        Std.Dev.
##  track_artist (Intercept) 13.51   
##  Residual                 19.75   
## Number of obs: 32828, groups:  track_artist, 10692
## Fixed Effects:
##    (Intercept)        energy_c  danceability_c       valence_c         tempo_c  
##      38.278893      -13.868005        3.480819        6.058650        0.006744  
##  speechiness_c     mode_fMajor  
##      -0.667654        0.813036
summary(spotify_model)  
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c +  
##     speechiness_c + mode_f + (1 | track_artist)
##    Data: data
## 
## REML criterion at convergence: 296314
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0723 -0.5193  0.1095  0.6230  3.2771 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  track_artist (Intercept) 182.6    13.51   
##  Residual                 389.9    19.75   
## Number of obs: 32828, groups:  track_artist, 10692
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     38.278893   0.241162 158.727
## energy_c       -13.868005   0.803975 -17.249
## danceability_c   3.480819   1.065912   3.266
## valence_c        6.058650   0.636827   9.514
## tempo_c          0.006744   0.004769   1.414
## speechiness_c   -0.667654   1.405035  -0.475
## mode_fMajor      0.813036   0.250380   3.247
## 
## Correlation of Fixed Effects:
##             (Intr) enrgy_ dncbl_ vlnc_c temp_c spchn_
## energy_c     0.027                                   
## danceblty_c -0.020  0.095                            
## valence_c    0.001 -0.220 -0.337                     
## tempo_c      0.001 -0.105  0.182 -0.030              
## speechnss_c -0.015  0.010 -0.058 -0.043 -0.100       
## mode_fMajor -0.585  0.009  0.017  0.000 -0.012  0.035
tab_model(spotify_model, show.std = TRUE)
  track popularity
Predictors Estimates std. Beta CI standardized CI p
(Intercept) 38.28 -0.17 37.81 – 38.75 -0.19 – -0.15 <0.001
energy c -13.87 -0.10 -15.44 – -12.29 -0.11 – -0.09 <0.001
danceability c 3.48 0.02 1.39 – 5.57 0.01 – 0.03 0.001
valence c 6.06 0.06 4.81 – 7.31 0.04 – 0.07 <0.001
tempo c 0.01 0.01 -0.00 – 0.02 -0.00 – 0.02 0.157
speechiness c -0.67 -0.00 -3.42 – 2.09 -0.01 – 0.01 0.635
mode f [Major] 0.81 0.03 0.32 – 1.30 0.01 – 0.05 0.001
Random Effects
σ2 389.88
τ00 track_artist 182.61
ICC 0.32
N track_artist 10692
Observations 32828
Marginal R2 / Conditional R2 0.014 / 0.329
# Model summary:
# Artist-level differences are large (ICC ~ 0.32), so using an LMM with artist as a random effect is justified.
#Fixed effects (within-artist associations):
#- Energy: strong negative association with popularity (higher energy -> lower popularity).
#- Danceability: small positive association (more danceable -> slightly higher popularity).
#- Valence: clear positive association (more positive mood -> higher popularity).
#- Mode: major is slightly more popular than minor.
#- Tempo and speechiness: not significant once the other predictors are in the model.
# Overall: the audio features explain only a small part of popularity (marginal R2 ~ 0.01),
# while adding artist differences boosts explained variance a lot (conditional R2 ~ 0.33).



# Overall, as a musician, I would write a song that avoids extreme energy, focuses on a positive mood and good danceability, leans slightly toward a major key, and does not worry too much about tempo or spoken elements.

#simple model

#building the model
# Simple model: focusing only on danceability
spotify_model_simple <- lmer(
  track_popularity ~
    danceability_c +
    (1 | track_artist),
  data = data
)


# 1) Linearity and homoscedasticity: residuals vs fitted values
diag_simple <- data.frame(
  fitted = fitted(spotify_model_simple),
  resid  = resid(spotify_model_simple)
)

ggplot(diag_simple, aes(x = fitted, y = resid)) +
  geom_point(color = "grey40", alpha = 0.4, size = 1.3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Simple model: residuals vs fitted values"
  ) +
  base_theme

# Homoscedasticity, similarly to the previous model, is not perfect.

#Model convergence and random-effect variance
summary(spotify_model_simple)
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ danceability_c + (1 | track_artist)
##    Data: data
## 
## REML criterion at convergence: 296650.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0831 -0.5157  0.1104  0.6214  3.3255 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  track_artist (Intercept) 187.2    13.68   
##  Residual                 392.9    19.82   
## Number of obs: 32828, groups:  track_artist, 10692
## 
## Fixed effects:
##                Estimate Std. Error t value
## (Intercept)      38.844      0.197 197.158
## danceability_c    7.179      0.989   7.259
## 
## Correlation of Fixed Effects:
##             (Intr)
## danceblty_c -0.013
isSingular(spotify_model_simple, tol = 1e-4)
## [1] FALSE
#The simple model converged properly, is not singular, and shows meaningful random-effect variance, indicating that the model specification is appropriate from an assumptions perspective.

#Q–Q plot of residuals
diag_df <- data.frame(resid = resid(spotify_model_simple))

ggplot(diag_df, aes(sample = resid)) +
  stat_qq(color = "grey40", alpha = 0.6) +
  stat_qq_line(color = "black") +
  labs(
    x = "Theoretical quantiles",
    y = "Sample quantiles",
    title = "Residual Q–Q plot"
  ) +
  base_theme

#histogram of the residuals
ggplot(diag_df, aes(x = resid)) +
  geom_histogram(bins = 30, fill = "grey85", color = "black") +
  labs(
    x = "Residuals",
    y = "Count",
    title = "Residual distribution"
  ) +
  base_theme

#residual normality seems acceptable


spotify_model_simple
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ danceability_c + (1 | track_artist)
##    Data: data
## REML criterion at convergence: 296650.3
## Random effects:
##  Groups       Name        Std.Dev.
##  track_artist (Intercept) 13.68   
##  Residual                 19.82   
## Number of obs: 32828, groups:  track_artist, 10692
## Fixed Effects:
##    (Intercept)  danceability_c  
##         38.844           7.179
summary(spotify_model_simple)  
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ danceability_c + (1 | track_artist)
##    Data: data
## 
## REML criterion at convergence: 296650.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0831 -0.5157  0.1104  0.6214  3.3255 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  track_artist (Intercept) 187.2    13.68   
##  Residual                 392.9    19.82   
## Number of obs: 32828, groups:  track_artist, 10692
## 
## Fixed effects:
##                Estimate Std. Error t value
## (Intercept)      38.844      0.197 197.158
## danceability_c    7.179      0.989   7.259
## 
## Correlation of Fixed Effects:
##             (Intr)
## danceblty_c -0.013
tab_model(spotify_model_simple, show.std = TRUE)
  track popularity
Predictors Estimates std. Beta CI standardized CI p
(Intercept) 38.84 -0.15 38.46 – 39.23 -0.16 – -0.13 <0.001
danceability c 7.18 0.04 5.24 – 9.12 0.03 – 0.05 <0.001
Random Effects
σ2 392.90
τ00 track_artist 187.23
ICC 0.32
N track_artist 10692
Observations 32828
Marginal R2 / Conditional R2 0.002 / 0.324
#Simple model results: Danceability alone shows a clear positive association with track popularity: more danceable tracks tend to be more popular within the same artist. However, the effect size is small (marginal R2 ~ 0.002), while artist-level differences remain large (ICC ~ 0.32), indicating that danceability by itself explains only a very small part of popularity.

#comparing the two models

anova(spotify_model_simple, spotify_model)
## refitting model(s) with ML (instead of REML)
## Data: data
## Models:
## spotify_model_simple: track_popularity ~ danceability_c + (1 | track_artist)
## spotify_model: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c + speechiness_c + mode_f + (1 | track_artist)
##                      npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## spotify_model_simple    4 296659 296692 -148325   296651                     
## spotify_model           9 296327 296403 -148155   296309 341.33  5  < 2.2e-16
##                         
## spotify_model_simple    
## spotify_model        ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model comparison shows that the extended model fits the data significantly better than the simple, danceability-only model (AIC and BIC values improved, and the p-value indicates a significant improvement and the extended model explains more variance, with higher marginal R² and a slightly higher conditional R²)..This suggests that track popularity depends on multiple musical features jointly, and cannot be explained by danceability alone.